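/* Colour-keyed (transparent-colour) sprite blitters for 16/15-bit Bitmaps.

  NOTE: the original header was written in a non-UTF-8 encoding and is only
  partly recoverable; two lines (apparently describing the transparent colour
  and a currently unsupported case) could not be reconstructed.

  The routines use MMX instructions and process four pixels per iteration,
  so for speed the width should be a multiple of 4.

  Runs only on CPUs that support MMX.
*/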

#include "gl.h"
//#include "gengine.h"
#include "sprite.h"

#undef _SPRITE_BLIT_BETA_MMX_HEAD

// Transparent colour (colour key); the 64-bit copy is what gets loaded into mm7
extern DWORD dwColorKey;
extern QWORD qwColorKey;
// Masks ANDed with a pixel after a right shift: Mask1 after >>3, Mask2 after >>2, Mask4 after >>1
extern DWORD dwAlphaMask1, dwAlphaMask2, dwAlphaMask4;
extern QWORD qwAlphaMask1, qwAlphaMask2, qwAlphaMask4;

// SpriteBlitBeta0MMX
// Colour-keyed blit of 16-bit pixels in which every source pixel that differs
// from the colour key is written to dest as 0; pixels equal to the key leave
// dest untouched.
//
//   dest, x, y : destination bitmap and position
//   src        : source bitmap
//
// SPRITE_CLIP is defined elsewhere; this function relies on it to provide
// w, h, sx and sy (none of them is declared here) — presumably the clipped
// size and the source origin; confirm against the macro.
//
// Register use inside the __asm block:
//   esi / edi : current source / destination pixel
//   ecx       : pixels left in the current row
//   edx       : bytes to skip from the end of a source row span to the next row
//   bx        : colour key (scalar tail),  mm7 : colour key x4 (MMX loop)
void SpriteBlitBeta0MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

	// Pixels are 2 bytes wide; the *pitch values are the byte distance from
	// the end of the blitted span to the start of the next row.
	int spitch = src->pitch - w*2;
	char* sline = src->line[sy] + sx*2;
	int dpitch = dest->pitch - w*2;
	char* dline = dest->line[y] + x*2;

	__asm{
		mov esi, sline;
		mov edi, dline;
		cld;					// lodsw/stosw must move forward
		movq mm7, qwColorKey;
		mov edx, spitch;
		mov ecx, w;
		mov bx, word ptr dwColorKey;
		// ALIGN must precede the label: with the label first, the jump target
		// itself is unaligned and the padding is executed on every iteration.
		ALIGN 4;
line_begin:
		cmp ecx, 4;
		jl line_tail;			// fewer than 4 pixels left: scalar tail

		// Four pixels at a time.
		movq mm0, [esi];		// mm0 = 4 source pixels
		movq mm3, [edi];		// mm3 = 4 destination pixels
		movq mm2, mm0;
		sub ecx, 4;
		pcmpeqw mm2, mm7;		// mm2 = 0xFFFF where source == colour key
		add esi, 8;
		pand mm2, mm3;			// keep dest where transparent, 0 elsewhere
		movq [edi], mm2;
		add edi, 8;
		jmp line_begin;

line_tail:
		jecxz next_line;		// row width was a multiple of 4
next_pixel:
		lodsw;
		dec ecx;
		cmp ax, bx;
		je trans_pixel;
		xor ax, ax;				// opaque pixel -> 0
		stosw;
		test ecx, ecx;
		jnz next_pixel;
		jmp next_line;
trans_pixel:
		add edi, 2;				// transparent: skip the destination pixel
		test ecx, ecx;
		jnz next_pixel;
next_line:
		add edi, dpitch;
		add esi, edx;
		mov ecx, w;
		dec h;
		jnz line_begin;

		emms;					// leave MMX state so x87 code works again
		}
}

// SpriteBlitBeta1MMX
// Colour-keyed 16-bit blit that writes each opaque source pixel as
// (pixel >> 3) & AlphaMask1.
//
// The loop skeleton comes from sprite_b.hpp, included four times with
// _SPRITE_BLIT_BETA_MMX_HEAD = 0..3. That file is not visible here, so the
// register roles noted below are inferred from how the registers are used in
// this function — confirm against sprite_b.hpp.
void SpriteBlitBeta1MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

#define _SPRITE_BLIT_BETA_MMX_HEAD 0
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		movq mm6, qwAlphaMask1;	// mask for the MMX path
		mov dx, word ptr dwAlphaMask1;	// same mask for the scalar tail

#define _SPRITE_BLIT_BETA_MMX_HEAD 1
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Presumably mm0 = source pixels, mm2 = colour-key compare mask,
		// mm4 = destination pixels on entry (set by section 1 — confirm).
		psrlq mm0, 3;
		pand mm4, mm2;	// mm4 & mask
		pand mm0, mm6;	// (mm0 >> 3) & AlphaMask1
		add esi, 8;
		pandn mm2, mm0;	// ~mask & scaled value
		sub ecx, 4;
		por mm2, mm4;	// merged result in mm2

#define _SPRITE_BLIT_BETA_MMX_HEAD 2
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Scalar version of the same scaling; ax presumably holds one source pixel.
		shr ax, 3;
		and ax, dx;

#define _SPRITE_BLIT_BETA_MMX_HEAD 3
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

}

// SpriteBlitBeta2MMX
// Colour-keyed 16-bit blit that writes each opaque source pixel as
// (pixel >> 2) & AlphaMask2.
//
// The loop skeleton comes from sprite_b.hpp, included four times with
// _SPRITE_BLIT_BETA_MMX_HEAD = 0..3. That file is not visible here, so the
// register roles noted below are inferred from how the registers are used in
// this function — confirm against sprite_b.hpp.
void SpriteBlitBeta2MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

#define _SPRITE_BLIT_BETA_MMX_HEAD 0
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		movq mm6, qwAlphaMask2;	// mask for the MMX path
		mov dx, word ptr dwAlphaMask2;	// same mask for the scalar tail

#define _SPRITE_BLIT_BETA_MMX_HEAD 1
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Presumably mm0 = source pixels, mm2 = colour-key compare mask,
		// mm4 = destination pixels on entry (set by section 1 — confirm).
		psrlq mm0, 2;
		pand mm4, mm2;	// mm4 & mask
		pand mm0, mm6;	// (mm0 >> 2) & AlphaMask2
		add esi, 8;
		pandn mm2, mm0;	// ~mask & scaled value
		sub ecx, 4;
		por mm2, mm4;	// merged result in mm2

#define _SPRITE_BLIT_BETA_MMX_HEAD 2
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Scalar version of the same scaling; ax presumably holds one source pixel.
		shr ax, 2;
		and ax, dx;

#define _SPRITE_BLIT_BETA_MMX_HEAD 3
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

}

// SpriteBlitBeta3MMX
// Colour-keyed 16-bit blit that writes each opaque source pixel as
// ((pixel >> 1) & AlphaMask4) - ((pixel >> 3) & AlphaMask1).
//
// The loop skeleton comes from sprite_b.hpp, included four times with
// _SPRITE_BLIT_BETA_MMX_HEAD = 0..3. That file is not visible here, so the
// register roles noted below are inferred from how the registers are used in
// this function — confirm against sprite_b.hpp.
void SpriteBlitBeta3MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

#define _SPRITE_BLIT_BETA_MMX_HEAD 0
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		movq mm6, qwAlphaMask1;	// mask used after >>3
		movq mm5, qwAlphaMask4;	// mask used after >>1

#define _SPRITE_BLIT_BETA_MMX_HEAD 1
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Presumably mm0 and mm3 both hold the source pixels, mm2 the
		// colour-key compare mask and mm4 the destination pixels on entry
		// (set by section 1 — confirm).
		psrlq mm0, 3;
		psrlq mm3, 1;
		pand mm0, mm6;	// (src >> 3) & AlphaMask1
		pand mm3, mm5;	// (src >> 1) & AlphaMask4
		pand mm4, mm2;	// mm4 & mask
		psubd mm3, mm0;	// half minus eighth
		add esi, 8;
		pandn mm2, mm3;	// ~mask & scaled value
		sub ecx, 4;
		por mm2, mm4;	// merged result in mm2

#define _SPRITE_BLIT_BETA_MMX_HEAD 2
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Scalar version of the same arithmetic; ax presumably holds one
		// source pixel, dx is used as scratch.
		mov dx, ax;
		shr ax, 1;
		shr dx, 3;
		and ax, word ptr dwAlphaMask4;
		and dx, word ptr dwAlphaMask1;
		sub ax, dx;

#define _SPRITE_BLIT_BETA_MMX_HEAD 3
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

}

// SpriteBlitBeta4MMX
// Colour-keyed blit of 16-bit pixels that writes each opaque source pixel as
// (pixel >> 1) & AlphaMask4; pixels equal to the colour key leave dest
// untouched.
//
//   dest, x, y : destination bitmap and position
//   src        : source bitmap
//
// SPRITE_CLIP is defined elsewhere; this function relies on it to provide
// w, h, sx and sy (none of them is declared here) — presumably the clipped
// size and the source origin; confirm against the macro.
//
// Register use inside the __asm block:
//   esi / edi : current source / destination pixel
//   ecx       : pixels left in the current row
//   bx, dx    : colour key and AlphaMask4 for the scalar tail
//   mm7, mm6  : colour key x4 and AlphaMask4 for the MMX loop
void SpriteBlitBeta4MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

	// Pixels are 2 bytes wide; the *pitch values are the byte distance from
	// the end of the blitted span to the start of the next row.
	int spitch = src->pitch - w*2;
	char* sline = src->line[sy] + sx*2;
	int dpitch = dest->pitch - w*2;
	char* dline = dest->line[y] + x*2;

	__asm{
		mov esi, sline;
		mov edi, dline;
		cld;					// lodsw/stosw must move forward
		movq mm7, qwColorKey;
		movq mm6, qwAlphaMask4;
		mov ecx, w;
		mov bx, word ptr dwColorKey;
		mov dx, word ptr dwAlphaMask4;
		// ALIGN must precede the label: with the label first, the jump target
		// itself is unaligned and the padding is executed on every iteration.
		ALIGN 4;
line_begin:
		cmp ecx, 4;
		jl line_tail;			// fewer than 4 pixels left: scalar tail

		// Four pixels at a time.
		movq mm0, [esi];		// mm0 = 4 source pixels
		movq mm3, [edi];		// mm3 = 4 destination pixels
		movq mm1, mm0;
		movq mm2, mm0;
		pcmpeqw mm1, mm7;		// mm1 = key mask (consumed by pandn)
		psrlq mm0, 1;
		pcmpeqw mm2, mm7;		// mm2 = key mask (consumed by pand)

		pand mm0, mm6;			// (src >> 1) & AlphaMask4
		pand mm2, mm3;			// dest where transparent
		pandn mm1, mm0;			// scaled source where opaque
		sub ecx, 4;
		por mm2, mm1;
		add esi, 8;
		movq [edi], mm2;
		add edi, 8;
		jmp line_begin;

line_tail:
		jecxz next_line;		// row width was a multiple of 4
next_pixel:
		lodsw;
		dec ecx;
		cmp ax, bx;
		je trans_pixel;
		shr ax, 1;				// same scaling as the MMX path
		and ax, dx;
		stosw;
		test ecx, ecx;
		jnz next_pixel;
		jmp next_line;
trans_pixel:
		add edi, 2;				// transparent: skip the destination pixel
		test ecx, ecx;
		jnz next_pixel;
next_line:
		add esi, spitch;
		add edi, dpitch;
		mov ecx, w;
		dec h;
		jnz line_begin;

		emms;					// leave MMX state so x87 code works again
		}
}

// SpriteBlitBeta5MMX
// Colour-keyed 16-bit blit that writes each opaque source pixel as
// ((pixel >> 1) & AlphaMask4) + ((pixel >> 3) & AlphaMask1).
//
// The loop skeleton comes from sprite_b.hpp, included four times with
// _SPRITE_BLIT_BETA_MMX_HEAD = 0..3. That file is not visible here, so the
// register roles noted below are inferred from how the registers are used in
// this function — confirm against sprite_b.hpp.
void SpriteBlitBeta5MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

#define _SPRITE_BLIT_BETA_MMX_HEAD 0
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		movq mm6, qwAlphaMask1;	// mask used after >>3
		movq mm5, qwAlphaMask4;	// mask used after >>1

#define _SPRITE_BLIT_BETA_MMX_HEAD 1
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Presumably mm0 and mm3 both hold the source pixels, mm2 the
		// colour-key compare mask and mm4 the destination pixels on entry
		// (set by section 1 — confirm).
		psrlq mm0, 1;
		psrlq mm3, 3;
		pand mm0, mm5;	// (src >> 1) & AlphaMask4
		pand mm3, mm6;	// (src >> 3) & AlphaMask1
		pand mm4, mm2;	// mm4 & mask
		paddd mm0, mm3;	// half plus eighth
		add esi, 8;
		pandn mm2, mm0;	// ~mask & scaled value
		sub ecx, 4;
		por mm2, mm4;	// merged result in mm2

#define _SPRITE_BLIT_BETA_MMX_HEAD 2
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Scalar version of the same arithmetic; ax presumably holds one
		// source pixel, dx is used as scratch.
		mov dx, ax;
		shr ax, 1;
		shr dx, 3;
		and ax, word ptr dwAlphaMask4;
		and dx, word ptr dwAlphaMask1;
		add ax, dx;

#define _SPRITE_BLIT_BETA_MMX_HEAD 3
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

}

// SpriteBlitBeta6MMX
// Colour-keyed 16-bit blit that writes each opaque source pixel as
// pixel - ((pixel >> 2) & AlphaMask2).
//
// The loop skeleton comes from sprite_b.hpp, included four times with
// _SPRITE_BLIT_BETA_MMX_HEAD = 0..3. That file is not visible here, so the
// register roles noted below are inferred from how the registers are used in
// this function — confirm against sprite_b.hpp.
//
// Fix: the scalar tail used to compute (pixel>>1) + (pixel>>2), the formula
// of the commented-out MMX code, while the active MMX path computes
// pixel - (pixel>>2). The two round differently (a channel value of 31 gives
// 22 vs 24), so the last w%4 columns could come out darker than the rest of
// the row. The tail now uses the same subtraction as the MMX path.
void SpriteBlitBeta6MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

#define _SPRITE_BLIT_BETA_MMX_HEAD 0
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		movq mm6, qwAlphaMask2;	// mask used after >>2
		movq mm5, qwAlphaMask4;	// not used by the active code below; kept in case sprite_b.hpp reads mm5 — confirm

#define _SPRITE_BLIT_BETA_MMX_HEAD 1
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		/* Previous formulation, (src>>1) + (src>>2), kept for reference:
		psrlq mm0, 1;
		psrlq mm3, 2;
		pand mm0, mm5;
		pand mm3, mm6;
		pand mm4, mm2;
		paddd mm0, mm3;
		add esi, 8;
		pandn mm2, mm0;
		sub ecx, 4;
		por mm2, mm4;
*/
		// Presumably mm0 and mm3 both hold the source pixels, mm2 the
		// colour-key compare mask and mm4 the destination pixels on entry
		// (set by section 1 — confirm).
		psrlq mm3, 2;
		pand mm4, mm2;	// mm4 & mask
		pand mm3, mm6;	// (src >> 2) & AlphaMask2
		psubw mm0, mm3;	// src - quarter
		add esi, 8;
		pandn mm2, mm0;	// ~mask & scaled value
		sub ecx, 4;
		por mm2, mm4;	// merged result in mm2

#define _SPRITE_BLIT_BETA_MMX_HEAD 2
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Scalar tail: same src - quarter arithmetic as the MMX path above.
		// ax presumably holds one source pixel; dx is scratch. Each masked
		// channel of dx is no larger than the matching channel of ax, so the
		// subtraction cannot borrow across channels.
		mov dx, ax;
		shr dx, 2;
		and dx, word ptr dwAlphaMask2;
		sub ax, dx;

#define _SPRITE_BLIT_BETA_MMX_HEAD 3
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

}

// SpriteBlitBeta7MMX
// Colour-keyed 16-bit blit that writes each opaque source pixel as
// pixel - ((pixel >> 3) & AlphaMask1).
//
// The loop skeleton comes from sprite_b.hpp, included four times with
// _SPRITE_BLIT_BETA_MMX_HEAD = 0..3. That file is not visible here, so the
// register roles noted below are inferred from how the registers are used in
// this function — confirm against sprite_b.hpp.
void SpriteBlitBeta7MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

#define _SPRITE_BLIT_BETA_MMX_HEAD 0
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		movq mm6, qwAlphaMask1;	// mask used after >>3

#define _SPRITE_BLIT_BETA_MMX_HEAD 1
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Presumably mm0 and mm3 both hold the source pixels, mm2 the
		// colour-key compare mask and mm4 the destination pixels on entry
		// (set by section 1 — confirm).
		psrlq mm0, 3;
		pand mm0, mm6;	// (src >> 3) & AlphaMask1
		pand mm4, mm2;	// mm4 & mask
		psubd mm3, mm0;	// src - eighth
		add esi, 8;
		pandn mm2, mm3;	// ~mask & scaled value
		sub ecx, 4;
		por mm2, mm4;	// merged result in mm2

#define _SPRITE_BLIT_BETA_MMX_HEAD 2
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

		// Scalar version of the same arithmetic; ax presumably holds one
		// source pixel, dx is used as scratch.
		mov dx, ax;
		shr dx, 3;
		and dx, word ptr dwAlphaMask1;
		sub ax, dx;

#define _SPRITE_BLIT_BETA_MMX_HEAD 3
#include "sprite_b.hpp"
#undef _SPRITE_BLIT_BETA_MMX_HEAD

}

// SpriteBlitBeta8MMX
// Plain colour-keyed blit of 16-bit pixels: every source pixel that differs
// from the colour key is copied unchanged; pixels equal to the key leave
// dest untouched.
//
//   dest, x, y : destination bitmap and position
//   src        : source bitmap
//
// SPRITE_CLIP is defined elsewhere; this function relies on it to provide
// w, h, sx and sy (none of them is declared here) — presumably the clipped
// size and the source origin; confirm against the macro.
//
// Register use inside the __asm block:
//   esi / edi : current source / destination pixel
//   ecx       : pixels left in the current row
//   edx       : bytes to skip from the end of a source row span to the next row
//   bx        : colour key (scalar tail),  mm7 : colour key x4 (MMX loop)
void SpriteBlitBeta8MMX( Bitmap* dest, int x, int y, Bitmap* src )
{
	SPRITE_CLIP;

	// Pixels are 2 bytes wide; the *pitch values are the byte distance from
	// the end of the blitted span to the start of the next row.
	int spitch = src->pitch - w*2;
	char* sline = src->line[sy] + sx*2;
	int dpitch = dest->pitch - w*2;
	char* dline = dest->line[y] + x*2;

	__asm{
		mov esi, sline;
		mov edi, dline;
		cld;					// lodsw/stosw must move forward
		movq mm7, qwColorKey;
		mov edx, spitch;
		mov ecx, w;
		mov bx, word ptr dwColorKey;
		// ALIGN must precede the label: with the label first, the jump target
		// itself is unaligned and the padding is executed on every iteration.
		ALIGN 4;
line_begin:
		cmp ecx, 4;
		jl line_tail;			// fewer than 4 pixels left: scalar tail

		// Four pixels at a time.
		movq mm0, [esi];		// mm0 = 4 source pixels
		movq mm3, [edi];		// mm3 = 4 destination pixels
		movq mm2, mm0;
		movq mm1, mm0;
		pcmpeqw mm2, mm7;		// mm2 = key mask (consumed by pand)
		pcmpeqw mm1, mm7;		// mm1 = key mask (consumed by pandn)

		pand mm2, mm3;			// dest where transparent
		pandn mm1, mm0;			// source where opaque
		sub ecx, 4;
		por mm2, mm1;
		add esi, 8;
		movq [edi], mm2;
		add edi, 8;
		jmp line_begin;

line_tail:
		jecxz next_line;		// row width was a multiple of 4
next_pixel:
		lodsw;
		dec ecx;
		cmp ax, bx;
		je trans_pixel;
		stosw;					// opaque: copy as is
		test ecx, ecx;
		jnz next_pixel;
		jmp next_line;
trans_pixel:
		add edi, 2;				// transparent: skip the destination pixel
		test ecx, ecx;
		jnz next_pixel;
next_line:
		add edi, dpitch;
		add esi, edx;
		mov ecx, w;
		dec h;
		jnz line_begin;

		emms;					// leave MMX state so x87 code works again
		}
}

// SpriteBlitBeta9MMX
// Colour-keyed blit of 16-bit pixels with a variable brightness. The
// brightness argument is shifted right by 3 and each colour channel of an
// opaque pixel is multiplied by the result and shifted right by 5.
// Presumably brightness is 0..255, giving 32 levels (0..31) — confirm with
// the callers. Pixels equal to the colour key leave dest untouched.
//
//   dest, x, y : destination bitmap and position
//   src        : source bitmap
//   brightness : brightness before the >>3 (the local copy is modified)
//
// SPRITE_CLIP is defined elsewhere; this function relies on it to provide
// w, h, sx and sy (none of them is declared here) — confirm against the macro.
// colorMask, gMask, rbMask, dwColorKey and qwColorKey are defined elsewhere.
// colorMask is read as three consecutive QWORDs (offsets 0, 8, 16).
void SpriteBlitBeta9MMX( Bitmap* dest, int x, int y, Bitmap* src, int brightness )
{
	SPRITE_CLIP;

	// Pixels are 2 bytes wide; the *pitch values are the byte distance from
	// the end of the blitted span to the start of the next row.
	int spitch = src->pitch - w*2;
	char* sline = src->line[sy] + sx*2;
	int dpitch = dest->pitch - w*2;
	char* dline = dest->line[y] + x*2;

	__asm{
		lea eax, colorMask;
		shr brightness, 3;		// multiplier used by both paths
		movq mm4, [eax];		// channel that is pre-shifted by 5 (presumably the high one)
		movq mm5, [eax+8];		// channel that is pre-shifted by 5 (presumably the middle one)
		movq mm6, [eax+16];		// channel multiplied in place (presumably the low one)
		mov esi, sline;
		mov edi, dline;
		mov eax, brightness;
		cld;					// lodsw/stosw must move forward
		mov ebx, eax;
		shl eax, 16;
		mov ecx, w;
		mov ax, bx;				// eax = brightness in both 16-bit halves
		movd mm7, eax;
		movq mm0, mm7;
		punpcklwd mm0, mm7;		// mm0 = brightness in all four words
		// ALIGN must precede the label: with the label first, the jump target
		// itself is unaligned and the padding is executed on every iteration.
		ALIGN 4;
line_begin:
		cmp ecx, 4;
		jl line_tail;			// fewer than 4 pixels left: scalar tail

		// Four pixels at a time. Each channel is isolated, multiplied by the
		// brightness and brought back to its position by the >>5.
		movq mm1, [esi];
		movq mm7, mm1;
		movq mm3, mm1;
		movq mm2, mm1;
		pcmpeqw mm7, qwColorKey;	// mm7 = 0xFFFF where source == colour key
		pand mm3, mm6;
		pand mm2, mm5;
		pmullw mm3, mm0;
		psrlw mm2, 5;			// pre-shift so the product fits in 16 bits
		pand mm1, mm4;
		pmullw mm2, mm0;
		psrlw mm1, 5;
		pand mm2, mm5;
		psrlw mm3, 5;
		pmullw mm1, mm0;
		por mm2, mm3;
		pand mm1, mm4;
		sub ecx, 2;				// the count is reduced by 4 in two steps
		por mm1, mm2;			// mm1 = scaled pixels
		movq mm3, mm7;
		movq mm2, [edi];
		pandn mm3, mm1;			// scaled source where opaque
		pand mm2, mm7;			// dest where transparent
		sub ecx, 2;
		por mm2, mm3;
		add esi, 8;
		movq [edi], mm2;

		add edi, 8;
		jmp line_begin;

line_tail:
		jecxz next_line;		// row width was a multiple of 4
next_pixel:
		lodsw;					// only ax is written; the ANDs below clear the upper
								// half of eax provided gMask/rbMask have no bits
								// above bit 15 — confirm
		dec ecx;
		cmp ax, word ptr dwColorKey;
		je trans_pixel;
		// Spread the channels (gMask part in the upper half, rbMask part in
		// the lower half) so one multiply scales all of them.
		mov ebx, eax;
		and eax, gMask;
		and ebx, rbMask;
		shl eax, 16;
		or  eax, ebx;
		mul brightness;			// clobbers edx, which is not otherwise used here
		shr eax, 5;
		mov ebx, eax;
		shr eax, 16;
		and ebx, rbMask;
		and eax, gMask;
		or eax, ebx;
		stosw;
		test ecx, ecx;
		jnz next_pixel;
		jmp next_line;
trans_pixel:
		add edi, 2;				// transparent: skip the destination pixel
		test ecx, ecx;
		jnz next_pixel;
next_line:
		add edi, dpitch;
		add esi, spitch;
		mov ecx, w;
		dec h;
		jnz line_begin;

		emms;					// leave MMX state so x87 code works again
		}
}

// SpriteBlitBeta10MMX
// Colour-keyed blit of 16-bit pixels with a brightness that varies per
// column. A table with one 16-bit multiplier per source column is first
// written to buf, then each opaque pixel has its colour channels multiplied
// by the entry for its column and shifted right by 5. Pixels equal to the
// colour key leave dest untouched.
//
//   dest, x, y : destination bitmap and position
//   src        : source bitmap
//   b1, b2     : brightness at the first and at the last source column,
//                before the >>3 (the local copy of b2 is modified)
//   buf        : scratch table, at least src->width shorts, overwritten here
//
// SPRITE_CLIP is defined elsewhere; this function relies on it to provide
// w, h, sx and sy (none of them is declared here) — confirm against the macro.
// colorMask, gMask, rbMask, dwColorKey and qwColorKey are defined elsewhere.
//
// Changes from the previous version:
//  * width == 1 with b1 != b2 entered `loop` with ecx == 0, which iterates
//    2^32 times and writes far past buf; that case now skips the loop.
//  * `repnz stosw` replaced by `rep stosw` (REPNE is only defined for
//    CMPS/SCAS).
//  * The scalar tail scaled every pixel by the constant b2 although the MMX
//    path uses the per-column table; the tail now reads the table as well,
//    and no longer destroys the table pointer in edx with `mul`.
//  * ALIGN moved in front of the loop label so the jump target is aligned.
void SpriteBlitBeta10MMX( Bitmap* dest, int x, int y, Bitmap* src, int b1, int b2, short* buf )
{
	SPRITE_CLIP;

	// Pixels are 2 bytes wide; the *pitch values are the byte distance from
	// the end of the blitted span to the start of the next row.
	int spitch = src->pitch - w*2;
	char* sline = src->line[sy] + sx*2;
	int dpitch = dest->pitch - w*2;
	char* dline = dest->line[y] + x*2;

	__asm{
		// ---- Build the brightness table: one short per source column ----
		mov edx, src;	//calculate brightness
		cld;					// stosw/lodsw must move forward
		mov edi, buf;
		mov ebx, b1;
		mov eax, b2;
		mov ecx, [edx]src.width;
		mov esi, edi;
		shr b2, 3;				// final table entry
		sub eax, ebx;			// eax = b2 - b1
		jz set_same_bright;		// flat brightness: fill with one value
		cdq;					// sign for the signed divide below
		shl eax, 16;			// 16.16 fixed point
		shl ebx, 16;
		idiv ecx;				// eax = ((b2 - b1) << 16) / width
		xchg eax, ebx;			// eax = b1 << 16 (accumulator), ebx = step
		dec ecx;				// width-1 interpolated entries, then b2
		jz set_last_bright;		// width == 1: nothing to interpolate
		// NOTE(review): the step is doubled here, so the interpolated
		// entries advance twice as fast as (b2 - b1) / width per column.
		// Left as found; confirm whether this is intended.
		shl ebx, 1;
		mov edx, eax;
set_brightness:
		shr eax, 19;			// drop the 16 fraction bits and apply the >>3
		stosw;
		mov eax, edx;
		add eax, ebx;
		mov edx, eax;
		loop set_brightness;
set_last_bright:
		mov eax, b2;			// last column gets b2 >> 3 exactly
		stosw;
		jmp set_bright_end;
set_same_bright:
		mov eax, ebx;
		shr eax, 3;
		rep stosw;				// ecx = width entries
set_bright_end:

		// ---- Blit ----
		mov esi, buf;
		mov edx, sx;
		lea eax, colorMask;
		shl edx, 1;
		movq mm4, [eax];		// channel that is pre-shifted by 5 (presumably the high one)
		add edx, esi;			// edx = table entry of the first blitted column
		movq mm5, [eax+8];		// channel that is pre-shifted by 5 (presumably the middle one)
		movq mm6, [eax+16];		// channel multiplied in place (presumably the low one)
		mov esi, sline;
		mov edi, dline;
		mov ecx, w;
		mov buf, edx;			// remembered so every row can restart the table
		// ALIGN must precede the label: with the label first, the jump target
		// itself is unaligned and the padding is executed on every iteration.
		ALIGN 4;
line_begin:
		cmp ecx, 4;
		jl line_tail;			// fewer than 4 pixels left: scalar tail

		// Four pixels at a time, each with its own multiplier from the table.
		movq mm1, [esi];
		movq mm0, [edx];		// mm0 = 4 table entries
		movq mm7, mm1;
		movq mm3, mm1;
		movq mm2, mm1;
		pcmpeqw mm7, qwColorKey;	// mm7 = 0xFFFF where source == colour key
		pand mm3, mm6;
		pand mm2, mm5;
		pmullw mm3, mm0;
		psrlw mm2, 5;			// pre-shift so the product fits in 16 bits
		pand mm1, mm4;
		pmullw mm2, mm0;
		psrlw mm1, 5;
		pand mm2, mm5;
		psrlw mm3, 5;
		pmullw mm1, mm0;
		por mm2, mm3;
		pand mm1, mm4;
		add edx, 8;
		por mm1, mm2;			// mm1 = scaled pixels
		movq mm3, mm7;
		movq mm2, [edi];
		pandn mm3, mm1;			// scaled source where opaque
		pand mm2, mm7;			// dest where transparent
		sub ecx, 4;
		por mm2, mm3;
		add esi, 8;
		movq [edi], mm2;

		add edi, 8;
		jmp line_begin;

line_tail:
		jecxz next_line;		// row width was a multiple of 4
next_pixel:
		lodsw;					// only ax is written; the ANDs below clear the upper
								// half of eax provided gMask/rbMask have no bits
								// above bit 15 — confirm
		dec ecx;
		cmp ax, word ptr dwColorKey;
		je trans_pixel;
		// Spread the channels (gMask part in the upper half, rbMask part in
		// the lower half) so one multiply scales all of them.
		mov ebx, eax;
		and eax, gMask;
		and ebx, rbMask;
		shl eax, 16;
		or  eax, ebx;
		movzx ebx, word ptr [edx];	// multiplier of this column
		imul eax, ebx;			// same low 32 bits as mul, edx preserved
		shr eax, 5;
		mov ebx, eax;
		shr eax, 16;
		and ebx, rbMask;
		and eax, gMask;
		or eax, ebx;
		stosw;
		add edx, 2;				// next table entry
		test ecx, ecx;
		jnz next_pixel;
		jmp next_line;
trans_pixel:
		add edi, 2;				// transparent: skip the destination pixel
		add edx, 2;				// ...and its table entry
		test ecx, ecx;
		jnz next_pixel;
next_line:
		mov edx, buf;			// restart the table for the next row
		add edi, dpitch;
		add esi, spitch;
		mov ecx, w;
		dec h;
		jnz line_begin;

		emms;					// leave MMX state so x87 code works again
		}
}
